Tensorflow RNN¶

In [ ]:
!pip install note_seq
In [ ]:
import numpy as np
import tensorflow as tf
import keras
from keras import layers
import os
import zipfile
import random
from sklearn.model_selection import train_test_split
from music21 import converter, instrument, note, chord, stream
import tensorflow as tf
from google.protobuf import text_format
# from note_seq.protobuf import music_pb2
from sklearn.model_selection import train_test_split
# import tensorflow as andrew
In [ ]:
!wget "https://storage.googleapis.com/magentadata/datasets/bach-doodle/bach-doodle.jsonl-00000-of-00192.gz"
In [ ]:
###################
# Data Extraction #
###################
In [ ]:
file_count = 100000
count = 0
input_path = "bach-doodle.jsonl-00000-of-00192.gz"
dataset = []
import json
import gzip

with gzip.open(input_path, 'rb') as f_in:
  file_content = f_in.read().decode('utf-8')
  # Split the content into lines and iterate
  for line in file_content.splitlines():
      count += 1
      record = json.loads(line.strip())
      dataset.append(record)
      if count >= file_count:
          break

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
In [ ]:
################################################################################
# Task1: Symbolic, unconditioned generation                                    #
################################################################################
In [ ]:
!apt update && apt install fluidsynth -y
!brew install wget
!wget https://raw.githubusercontent.com/musescore/MuseScore/master/share/sound/FluidR3Mono_GM.sf3
!pip install midi2audio
!pip install IPython
!pip install midiutil
In [ ]:
##### Baseline: Markov Chain #####

from collections import defaultdict
import random
from numpy.random import choice
import numpy as np
import midi2audio

a = open("chords.json")
dataset = []
for l in a.readlines():
    d = eval(l)
    dataset.append(d)
flatDataset = []
for d in dataset:
    flat = []
    for part in d['chords']:
        for bar in part:
            flat += bar
    flatDataset.append(flat)
unigrams = defaultdict(int)
bigrams = defaultdict(int)
for d in flatDataset:
    for chord in d:
        unigrams[chord] += 1
    for (chord1,chord2) in zip(d[:-1],d[1:]):
        bigrams[(chord1,chord2)] += 1
unigramCounts = [(unigrams[k],k) for k in unigrams]
bigramCounts = [(bigrams[k],k) for k in bigrams]
unigramCounts.sort()
bigramCounts.sort()
dictionary = set(flatDataset[3])
transitions = defaultdict(list)
transitionProbabilities = defaultdict(list)

for b1,b2 in bigrams:
    if b1 in dictionary and b2 in dictionary:
        transitions[b1].append(b2)
        transitionProbabilities[b1].append(bigrams[(b1,b2)])

def sample(length):
    seq = [random.choice(list(transitionProbabilities.keys()))]
    while len(seq) < length:
        probs = np.array(transitionProbabilities[seq[-1]])
        if not np.isclose(probs.sum(), 1.0):
            probs = probs / probs.sum()
        nextchord = choice(transitions[seq[-1]], 1, p=probs)
        seq.append(nextchord.item())
    return seq
In [ ]:
##### Generating Output Files #####
KEY_TO_IDX = {
    'C': 0,
    'C#': 1,
    'Db': 1,
    'D': 2,
    'D#': 3,
    'Eb': 3,
    'E': 4,
    'F': 5,
    'F#': 6,
    'Gb': 6,
    'G': 7,
    'G#': 8,
    'Ab': 8,
    'A': 9,
    'A#': 10,
    'Bb': 10,
    'B': 11,
    'Cb': 11,
}

# cover some of the qualities
QUALITY_TO_INTERVAL = {
    #        1     2     3  4     5     6     7
    '':     [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],  # major
    '-':    [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0],  # minor
    '+':    [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],  # augmented
    'o':    [1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],  # diminished
    'sus':  [1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],  # suspended
    '7':    [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],  # dominant 7th
    '7alt':    [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],  # dominant 7th
    'j7':   [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1],  # major 7th
    '-7':   [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0],  # minor 7th
    'o7':   [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0],  # diminished 7th
    'm7b5': [1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0],  # half-diminished
    '6':    [1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],  # major 6th
    '-6':   [1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0],  # minor 6th
    }

def chord_to_notes(chord):
    if len(chord) == 1 or chord[1] not in ['b','#']:
        root = chord[:1]
        quality = chord[1:]
    else:
        root = chord[:2]
        quality = chord[2:]

    bass = root

    root_c = 60
    bass_c = 36
    root_pc = KEY_TO_IDX[root]
    if quality not in QUALITY_TO_INTERVAL:
        raise ValueError('undefined chord quality {}'.format(quality))
    chord_map = list(np.where(np.array(QUALITY_TO_INTERVAL[quality]) == 1)[0])
    bass_pc = KEY_TO_IDX[bass]

    return [bass_c + bass_pc] + [root_c + root_pc + i for i in chord_map]

from midiutil import MIDIFile

midi = MIDIFile(1) # Create a MIDI file that consists of 1 track
track = 0 # Set track number
time = 0 # Where is the event placed (at the beginning)
tempo = 120 # The tempo (beats per minute)
midi.addTempo(track, time, tempo) # Add tempo information
baseline1_chords = sample(10)
current_time = 0
default_duration = 4  # one beat
for chord in baseline1_chords:
    notes = chord_to_notes(chord)
    print(notes)
    for pitch in notes:
        midi.addNote(track, 0, pitch, current_time, default_duration, 100)
    current_time += default_duration

with open("chord_sample.mid", "wb") as f:
    midi.writeFile(f) # write MIDI file

from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth
# for i in range(len(predictions)):
fs.midi_to_audio("chord_sample.mid", "chord_sample.wav")
display(Audio("chord_sample.wav"))
[45, np.int64(69), np.int64(72), np.int64(75), np.int64(78)]
[39, np.int64(63), np.int64(67), np.int64(70), np.int64(72)]
[36, np.int64(60), np.int64(64), np.int64(67), np.int64(70)]
[36, np.int64(60), np.int64(64), np.int64(67), np.int64(70)]
[36, np.int64(60), np.int64(64), np.int64(67), np.int64(70)]
[41, np.int64(65), np.int64(69), np.int64(72), np.int64(75)]
[41, np.int64(65), np.int64(69), np.int64(72), np.int64(75)]
[41, np.int64(65), np.int64(69), np.int64(72), np.int64(75)]
[46, np.int64(70), np.int64(74), np.int64(77), np.int64(80)]
[46, np.int64(70), np.int64(74), np.int64(77), np.int64(80)]
Your browser does not support the audio element.
In [ ]:
########### Our Model ############
In [ ]:
'''
Preparing and loading the data for the training and test set
'''
sequence_length = 20

#Get a random sample of the data dataset
random.seed(0)
subset_size = 1000
subset_data = random.sample(train_data, min(subset_size, len(train_data)))

sequences = []
for record in train_data:
    #Only train on datapoints that produced good harmonies
    # e.g. when feedback = 2
    if record.get('feedback', [None])[0] == '2':
        output_seq = record.get('output_sequence', [])
        sequence = []
        for timestep in output_seq:
            notes = timestep.get('notes', [])
            if notes:
                chord = [note['pitch'] for note in notes]
                sequence.append(chord)
        if sequence:
            sequences.append(sequence)

print(f"Number of sequences with feedback=2: {len(sequences)}")

# Create vocabulary
all_pitches = set(p for seq in sequences for chord in seq for p in chord)
pitch2idx = {p: i for i, p in enumerate(sorted(all_pitches))}
idx2pitch = {i: p for p, i in pitch2idx.items()}
vocab_size = len(pitch2idx)

#Group pitches of notes together into chords
#We change the pitches into a binary one-hot encoding for better multi-categorization
sequence_vectors = []
for chord_sequence in sequences:
    if len(chord_sequence) > sequence_length:
        binary_seq = []
        for chord in chord_sequence:
            chord_vec = np.zeros(vocab_size)
            for pitch in chord:
                if pitch in pitch2idx:
                    chord_vec[pitch2idx[pitch]] = 1.0
            binary_seq.append(chord_vec)
        sequence_vectors.append(binary_seq)

# Create training data windows based on chords
X, Y = [], []
for seq in sequence_vectors:
    for i in range(len(seq) - sequence_length):
        X.append(seq[i:i+sequence_length])
        Y.append(seq[i+sequence_length])

X = np.array(X, dtype=np.float32)
Y = np.array(Y, dtype=np.float32)

print(f"X shape: {X.shape}, Y shape: {Y.shape}")
Number of sequences with feedback=2: 26446
X shape: (474, 20, 46), Y shape: (474, 46)
In [ ]:
'''
Creating an LSTM model (a more specialized version of an RNN with better
sequential data processing)
We use a bidirection model to consider the past and future chords in the sequence
in hopes to produce more accurate results
'''

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import BinaryAccuracy

model = Sequential([
    Bidirectional(LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.3), input_shape=(sequence_length, vocab_size)),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(vocab_size, activation='sigmoid', kernel_regularizer=l2(0.001))
])

model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=[BinaryAccuracy()])

model.summary()
/usr/local/lib/python3.11/dist-packages/keras/src/layers/rnn/bidirectional.py:107: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(**kwargs)
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ bidirectional (Bidirectional)   │ (None, 20, 512)        │       620,544 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ bidirectional_1 (Bidirectional) │ (None, 256)            │       656,384 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 46)             │        11,822 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 1,288,750 (4.92 MB)
 Trainable params: 1,288,750 (4.92 MB)
 Non-trainable params: 0 (0.00 B)
In [ ]:
history = model.fit(X, Y, epochs=10, batch_size=32, validation_split=0.2)

# Calculate perplexity using validation loss
final_val_loss = history.history['val_loss'][-1]
val_perplexity = np.exp(final_val_loss)

print(f"Final Validation Loss: {final_val_loss:.4f}")
print(f"Perplexity: {val_perplexity:.4f}")
Epoch 1/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 20s 380ms/step - binary_accuracy: 0.6877 - loss: 0.6349 - val_binary_accuracy: 0.7625 - val_loss: 0.5310
Epoch 2/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 6s 226ms/step - binary_accuracy: 0.7625 - loss: 0.5290 - val_binary_accuracy: 0.7721 - val_loss: 0.4982
Epoch 3/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 5s 224ms/step - binary_accuracy: 0.7719 - loss: 0.4945 - val_binary_accuracy: 0.7645 - val_loss: 0.4819
Epoch 4/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 5s 223ms/step - binary_accuracy: 0.7766 - loss: 0.4744 - val_binary_accuracy: 0.7810 - val_loss: 0.4561
Epoch 5/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 5s 227ms/step - binary_accuracy: 0.7900 - loss: 0.4597 - val_binary_accuracy: 0.7952 - val_loss: 0.4488
Epoch 6/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 3s 259ms/step - binary_accuracy: 0.8057 - loss: 0.4452 - val_binary_accuracy: 0.7998 - val_loss: 0.4393
Epoch 7/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 5s 228ms/step - binary_accuracy: 0.8127 - loss: 0.4298 - val_binary_accuracy: 0.8190 - val_loss: 0.4281
Epoch 8/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 5s 240ms/step - binary_accuracy: 0.8207 - loss: 0.4248 - val_binary_accuracy: 0.8229 - val_loss: 0.4198
Epoch 9/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 5s 255ms/step - binary_accuracy: 0.8213 - loss: 0.4234 - val_binary_accuracy: 0.8149 - val_loss: 0.4179
Epoch 10/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 5s 277ms/step - binary_accuracy: 0.8230 - loss: 0.4221 - val_binary_accuracy: 0.8192 - val_loss: 0.4214
Final Validation Loss: 0.4214
Perplexity: 1.5241
In [ ]:
import matplotlib.pyplot as plt

val_losses = history.history['val_loss']
val_perplexities = [np.exp(loss) for loss in val_losses]

#Plot perplexity
plt.plot(val_perplexities)
plt.xlabel("Epoch")
plt.ylabel("Validation Perplexity")
plt.title("Perplexity over Epochs")
plt.grid(True)
plt.show()


loss = history.history['loss']
val_loss = history.history['val_loss']
acc = history.history['binary_accuracy']
val_acc = history.history['val_binary_accuracy']

epochs = range(1, len(loss) + 1)

plt.figure(figsize=(12, 5))

#Plot Loss
plt.subplot(1, 2, 1)
plt.plot(epochs, loss, 'b-', label='Training Loss')
plt.plot(epochs, val_loss, 'r--', label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

#Plot Accuracy
plt.subplot(1, 2, 2)
plt.plot(epochs, acc, 'b-', label='Training Accuracy')
plt.plot(epochs, val_acc, 'r--', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Binary Accuracy')
plt.legend()

plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
!wget https://raw.githubusercontent.com/musescore/MuseScore/master/share/sound/FluidR3Mono_GM.sf3
!pip install midi2audio
!pip install IPython
!pip install miditoolkit
!apt-get update
!apt-get install -y fluidsynth
In [ ]:
import miditoolkit
import subprocess
from IPython.display import Audio
In [ ]:
#Functions to generate the music piece
#Sample a binary chord vector using Bernoulli sampling with temperature scaling.
def sample_chord(probabilities, temperature=1.0):
    probs = np.clip(probabilities, 1e-8, 1 - 1e-8)
    logits = np.log(probs / (1 - probs)) / temperature
    scaled_probs = 1 / (1 + np.exp(-logits))
    return np.random.binomial(1, scaled_probs).astype(np.float32)

#Sample a chord from the top-k most probable pitches with Bernoulli sampling.
def top_k_binary_chord(probabilities, k=8, temperature=1.0):
    #Apply temperature
    probs = np.clip(probabilities, 1e-8, 1 - 1e-8)
    logits = np.log(probs / (1 - probs)) / temperature
    scaled_probs = 1 / (1 + np.exp(-logits))

    #Get top-k indices
    top_k_indices = np.argpartition(scaled_probs, -k)[-k:]
    top_k_probs = scaled_probs[top_k_indices]
    top_k_probs /= np.sum(top_k_probs)

    #Bernoulli sampling from top-k
    binary_chord = np.zeros_like(probabilities)
    sampled = np.random.binomial(1, top_k_probs)

    for i, s in zip(top_k_indices, sampled):
        binary_chord[i] = s

    if binary_chord.sum() == 0:
        binary_chord[top_k_indices[np.argmax(top_k_probs)]] = 1

    return binary_chord.astype(np.float32)

#Use LSTM to generate a new chord sequence
def generate_chord_sequence(seed_seq, length=50, temperature=1.0):
    generated = list(seed_seq)

    for _ in range(length):
        input_seq = np.array(generated[-sequence_length:]).reshape(1, sequence_length, vocab_size)
        pred = model.predict(input_seq, verbose=0)[0]
        chord = top_k_binary_chord(pred, k=8, temperature=temperature)

        if chord.sum() == 0:
            chord[np.argmax(pred)] = 1

        # Prevent exact repetition to get more varied results
        if len(generated) > 0 and np.array_equal(chord, generated[-1]):
            idx = np.random.randint(vocab_size)
            chord[idx] = 1 - chord[idx]

        generated.append(chord)

    return generated
In [ ]:
#Functions to play the music pieces from midi file

#Convert binary vector to list of MIDI pitches since we did one-hot encoding before
def binary_chord_to_pitches(binary_chord, idx2pitch):
    return [idx2pitch[i] for i, val in enumerate(binary_chord) if val > 0]

#Save the chords into a midi file
def save_chords_to_midi(chords, filename="generated.mid", velocity=80, duration=480):
    midi_obj = miditoolkit.MidiFile()
    instrument = miditoolkit.Instrument(program=0, is_drum=False, name="Generated")

    time = 0
    for chord in chords:
        for pitch in chord:
            note = miditoolkit.Note(
                velocity=velocity,
                pitch=int(pitch),
                start=time,
                end=time + duration
            )
            instrument.notes.append(note)
        time += duration

    midi_obj.instruments.append(instrument)
    midi_obj.tempo_changes.append(miditoolkit.TempoChange(120, time=0))  # 120 BPM
    midi_obj.dump(filename)
    print(f"MIDI saved to {filename}")
    return filename

#Save the chords into a midi file
#But we changed the timing and velocity up with randomization to produce less
#robotics results. We wanted to see if we could get something more natural sounding
def save_chords_to_midi_varied(chords, filename="generated.mid", base_velocity=80, base_duration=480):
    midi_obj = miditoolkit.MidiFile()
    instrument = miditoolkit.Instrument(program=0, is_drum=False, name="Generated")

    time = 0
    for chord in chords:
        # Randomize velocity and duration per chord
        velocity_variation = random.randint(-10, 10)
        duration_variation = random.randint(-60, 40)
        velocity = np.clip(base_velocity + velocity_variation, 40, 127)
        duration = max(120, base_duration + duration_variation)

        time_offset = random.randint(-10, 10)

        for pitch in chord:
            note = miditoolkit.Note(
                velocity=velocity,
                pitch=int(pitch),
                start=max(0, time + time_offset),
                end=max(0, time + time_offset + duration)
            )
            instrument.notes.append(note)

        time += base_duration

    midi_obj.instruments.append(instrument)
    midi_obj.tempo_changes.append(miditoolkit.TempoChange(120, time=0))
    midi_obj.dump(filename)
    print(f"MIDI saved to {filename}")
    return filename

#Transform the midi file into a wave file with FluidSynth so we can play the music
def midi_to_wav(midi_path, wav_path="output.wav", soundfont="FluidR3_GM.sf2"):
    subprocess.run(["fluidsynth", "-ni", soundfont, midi_path, "-F", wav_path, "-r", "44100"])
    print(f"WAV saved to {wav_path}")
    return wav_path

#Generate a new music output and diplay using IPyhton
def generate_and_play_midi(generated_binary_chords, idx2pitch,
                          midi_filename="generated.mid",
                          wav_filename="output.wav",
                          soundfont="FluidR3Mono_GM.sf2",
                          convert_to_wav=True):
    #Convert binary vectors to pitch lists
    pitch_chords = [binary_chord_to_pitches(chord, idx2pitch) for chord in generated_binary_chords]

    midi_file = save_chords_to_midi(pitch_chords, filename=midi_filename)

    if convert_to_wav:
        wav_file = midi_to_wav(midi_file, wav_path=wav_filename, soundfont=soundfont)
        display(Audio(wav_file))
    else:
        print("Error in WAV conversion and playback.")

#Generate a new music output and diplay using IPyhton with varied velocity and duration of notes
def generate_and_play_midi_varied(generated_binary_chords, idx2pitch,
                          midi_filename="generated.mid",
                          wav_filename="output.wav",
                          soundfont="FluidR3Mono_GM.sf2",
                          convert_to_wav=True):
    #Convert binary vectors to pitch lists
    pitch_chords = [binary_chord_to_pitches(chord, idx2pitch) for chord in generated_binary_chords]

    midi_file = save_chords_to_midi_varied(pitch_chords, filename=midi_filename)

    if convert_to_wav:
        wav_file = midi_to_wav(midi_file, wav_path=wav_filename, soundfont=soundfont)
        display(Audio(wav_file))
    else:
        print("Error in WAV conversion and playback.")


#We give the model a seed of zero binary arrays to start the generation process
#In the final output, we remove the random seed noise at the beginning to just
#get the unconditioned generated output
seed = [np.zeros(vocab_size, dtype=np.float32) for _ in range(sequence_length)]
generated = generate_chord_sequence(seed, length=100, temperature=1.0)
generate_and_play_midi(generated[sequence_length:], idx2pitch)
MIDI saved to generated.mid
WAV saved to output.wav
Your browser does not support the audio element.
In [ ]:
#Evalution metrics
from collections import Counter
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity

def pitch_histogram_entropy(generated_binary_chords, idx2pitch):
    all_pitches = []
    for chord in generated_binary_chords:
        all_pitches.extend(binary_chord_to_pitches(chord, idx2pitch))

    pitch_counts = Counter(all_pitches)
    probabilities = np.array(list(pitch_counts.values()), dtype=np.float32)
    probabilities /= probabilities.sum()
    return entropy(probabilities)

def chord_coverage(generated_binary_chords):
    unique_chords = set(tuple(chord) for chord in generated_binary_chords)
    return len(unique_chords)

def chord_histogram_entropy(generated_binary_chords):
    chord_strings = [tuple(chord) for chord in generated_binary_chords]
    chord_counts = Counter(chord_strings)
    probabilities = np.array(list(chord_counts.values()), dtype=np.float32)
    probabilities /= probabilities.sum()
    return entropy(probabilities)

def mean_chord_tonal_distance_cosine(generated_binary_chords):
    chords = np.array(generated_binary_chords, dtype=np.float32)
    distances = []
    for i in range(len(chords) - 1):
        if np.linalg.norm(chords[i]) > 0 and np.linalg.norm(chords[i+1]) > 0:
            cos_sim = cosine_similarity([chords[i]], [chords[i+1]])[0][0]
            distances.append(1 - cos_sim)
    return np.mean(distances) if distances else None
In [ ]:
# Modified Evalution metrics for Baseline Format

def pitch_histogram_entropy2(generated_binary_chords, idx2pitch):
    all_pitches = []
    for chord in generated_binary_chords:
        try:
            notes = chord_to_notes(chord)  # returns e.g., ['C4', 'E4', 'G4']
            # midi_pitches = [pitch.Pitch(n).midi for n in notes]
            all_pitches.extend(notes)
        except Exception as e:
            print(f"Error processing chord '{chord}': {e}")

    pitch_counts = Counter(all_pitches)
    probabilities = np.array(list(pitch_counts.values()), dtype=np.float32)
    probabilities /= probabilities.sum()
    return entropy(probabilities)

def chord_coverage2(generated_binary_chords):
    unique_chords = set(generated_binary_chords)
    return len(unique_chords)

def chord_histogram_entropy2(generated_binary_chords):
    # chord_strings = [tuple(chord) for chord in generated_binary_chords]
    chord_counts = Counter(generated_binary_chords)
    probabilities = np.array(list(chord_counts.values()), dtype=np.float32)
    probabilities /= probabilities.sum()
    return entropy(probabilities)

def chord_to_pitch_class_vector(chord_name):
    try:
        notes = chord_to_notes(chord_name)  # e.g. ['C4', 'E4', 'G4', 'B4']
        pitch_classes = [n % 12 for n in notes]
        vec = np.zeros(12)
        for pc in pitch_classes:
            vec[pc] = 1
        return vec
    except Exception as e:
        print(f"Error parsing chord '{chord_name}': {e}")
        return np.zeros(12)

def mean_chord_tonal_distance_cosine2(chord_sequence):
    chords = [chord_to_pitch_class_vector(ch) for ch in chord_sequence]
    distances = []

    for i in range(len(chords) - 1):
        if np.linalg.norm(chords[i]) > 0 and np.linalg.norm(chords[i+1]) > 0:
            cos_sim = cosine_similarity([chords[i]], [chords[i+1]])[0][0]
            distances.append(1 - cos_sim)

    return np.mean(distances) if distances else None
In [ ]:
##### Baseline Metrics #####

baseline1_pitch_entropy = pitch_histogram_entropy2(baseline1_chords, idx2pitch)
baseline1_coverage = chord_coverage2(baseline1_chords)
baseline1_chord_entropy = chord_histogram_entropy2(baseline1_chords)
baseline1_tonal_distance = mean_chord_tonal_distance_cosine2(baseline1_chords)
In [ ]:
print(baseline1_pitch_entropy)
print(baseline1_coverage)
print(baseline1_chord_entropy)
print(baseline1_tonal_distance)
2.7628703
5
1.5047883
0.25
In [ ]:
num_samples = 10
results = []

for i in range(num_samples):
    seed = [np.zeros(vocab_size, dtype=np.float32) for _ in range(sequence_length)]
    generated = generate_chord_sequence(seed, length=50, temperature=1.0)
    generated_trimmed = generated[sequence_length:]
    generate_and_play_midi(generated[sequence_length:], idx2pitch)

    pitch_entropy = pitch_histogram_entropy(generated_trimmed, idx2pitch)
    coverage = chord_coverage(generated_trimmed)
    chord_entropy = chord_histogram_entropy(generated_trimmed)
    tonal_distance = mean_chord_tonal_distance_cosine(generated_trimmed)

    results.append({
        "Song": f"Generated #{i+1}",
        "Pitch Entropy": round(pitch_entropy, 4),
        "Chord Coverage": coverage,
        "Chord Entropy": round(chord_entropy, 4),
        "Tonal Distance": round(tonal_distance, 4) if tonal_distance is not None else None
    })
In [ ]:
from tabulate import tabulate
print(tabulate(results, headers="keys", tablefmt="github"))
| Song          |   Pitch Entropy |   Chord Coverage |   Chord Entropy |   Tonal Distance |
|---------------|-----------------|------------------|-----------------|------------------|
| Generated #1  |          2.0312 |               22 |          2.6862 |           0.7776 |
| Generated #2  |          2.0325 |               21 |          2.4347 |           0.7383 |
| Generated #3  |          2.0686 |               23 |          2.633  |           0.8291 |
| Generated #4  |          1.9502 |               21 |          2.6315 |           0.8748 |
| Generated #5  |          1.9919 |               21 |          2.6251 |           0.8088 |
| Generated #6  |          2.1133 |               28 |          2.9297 |           0.7764 |
| Generated #7  |          2.051  |               25 |          2.8083 |           0.8186 |
| Generated #8  |          2.1784 |               19 |          2.4809 |           0.8583 |
| Generated #9  |          2.0098 |               25 |          2.7874 |           0.8175 |
| Generated #10 |          2.2312 |               24 |          2.8079 |           0.7396 |
In [ ]:
#Just for some more fun!
#We can play with the seed value to make up random binary input arrays with 1s, and 0s
#We can also get some intersting results varying the output velocity, duration, temperature
#We get some more staccato sounding notes now
for i in range(2):
    seed = [np.random.binomial(1, 0.1, vocab_size).astype(np.float32) for _ in range(sequence_length)]
    temperature = random.uniform(0.5, 1.5)
    print(f"Sample {i+1} | Temperature: {temperature:.2f}")
    generated_chords = generate_chord_sequence(seed,  length=50, temperature=temperature)

    midi_filename = f"generated_sample_{i+1}.mid"
    wav_filename = f"generated_sample_{i+1}.wav"

    print(f"Sample {i+1}")
    generate_and_play_midi_varied(
        generated_binary_chords=generated_chords[sequence_length:],
        idx2pitch=idx2pitch,
        midi_filename=midi_filename,
        wav_filename=wav_filename,
        soundfont="FluidR3Mono_GM.sf3"
    )
Sample 1 | Temperature: 1.36
Sample 1
MIDI saved to generated_sample_1.mid
WAV saved to generated_sample_1.wav
Your browser does not support the audio element.
Sample 2 | Temperature: 1.45
In [ ]:
#################################################
# Task 2: Conditioned Symbolic Music Generation #
#################################################
In [ ]:
####### Baseline: Conditioned Markov Chain ########
def sample_conditioned(length, required_notes=None):
    seq = [random.choice(list(transitionProbabilities.keys()))]

    for i in range(1, length):
        prev_chord = seq[-1]
        candidates = transitions[(prev_chord)]
        probs = np.array(transitionProbabilities[prev_chord], dtype=np.float64)

        if not np.isclose(probs.sum(), 1.0):
            probs /= probs.sum()

        required_note = required_notes[i] if required_notes else None

        if required_note:
            filtered = [
                (ch, p) for ch, p in zip(candidates, probs)
                if required_note in chord_to_notes(ch)
            ]
            if not filtered:
                # fallback if no chords match the required note
                nextchord = np.random.choice(candidates, p=probs)
            else:
                new_candidates, new_probs = zip(*filtered)
                new_probs = np.array(new_probs)
                new_probs /= new_probs.sum()
                nextchord = np.random.choice(new_candidates, p=new_probs)
        else:
            nextchord = np.random.choice(candidates, p=probs)

        seq.append(nextchord)
    return seq

def generate_random_midi_notes(n, low=48, high=72):
    return [random.randint(low, high) for _ in range(n)]

midi_notes = generate_random_midi_notes(10)
print(midi_notes)
chords = sample_conditioned(10, midi_notes)
[63, 65, 59, 72, 58, 51, 69, 48, 55, 55]
In [ ]:
midi = MIDIFile(1) # Create a MIDI file that consists of 1 track
track = 0 # Set track number
time = 0 # Where is the event placed (at the beginning)
tempo = 120 # The tempo (beats per minute)
midi.addTempo(track, time, tempo) # Add tempo information

current_time = 0
default_duration = 4  # one beat
for chord in chords:
    notes = chord_to_notes(chord)
    print(notes)
    for pitch in notes:
        midi.addNote(track, 0, pitch, current_time, default_duration, 100)
    current_time += default_duration

with open("conditioned_chord_sample.mid", "wb") as f:
    midi.writeFile(f) # write MIDI file


def pitch_class_dist(p, q):
    return min(abs((p % 12) - (q % 12)), 12 - abs((p % 12) - (q % 12)))


ground_truth = midi_notes

predicted_harmony = [chord_to_notes(chord)[0] for chord in chords]

correct = sum(p == t for p, t in zip(predicted_harmony, ground_truth))
accuracy = correct / len(ground_truth)

ctd_vals = [
    pitch_class_dist(predicted_harmony[i], predicted_harmony[i + 1])
    for i in range(len(predicted_harmony) - 1)
]
mean_ctd = np.mean(ctd_vals)

mctd_vals = [
    pitch_class_dist(p, t)
    for p, t in zip(predicted_harmony, ground_truth)
]
mean_mctd = np.mean(mctd_vals)

print(f"Baseline Accuracy = {accuracy:.4f}")
print(f"Baseline CTD = {mean_ctd:.4f}")
print(f"Baseline MCTD = {mean_mctd:.4f}")








from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth
# for i in range(len(predictions)):
fs.midi_to_audio("conditioned_chord_sample.mid", "conditioned_chord_sample.wav")
display(Audio("conditioned_chord_sample.wav"))
[44, np.int64(68), np.int64(72), np.int64(75), np.int64(78)]
[36, np.int64(60), np.int64(64), np.int64(67), np.int64(70)]
[41, np.int64(65), np.int64(69), np.int64(72), np.int64(75)]
[41, np.int64(65), np.int64(69), np.int64(72), np.int64(75)]
[46, np.int64(70), np.int64(73), np.int64(77), np.int64(80)]
[39, np.int64(63), np.int64(67), np.int64(70), np.int64(73)]
[41, np.int64(65), np.int64(69), np.int64(72), np.int64(75)]
[46, np.int64(70), np.int64(74), np.int64(77), np.int64(80)]
[46, np.int64(70), np.int64(74), np.int64(77), np.int64(80)]
[39, np.int64(63), np.int64(67), np.int64(70), np.int64(73)]
Baseline Accuracy = 0.0000
Baseline CTD = 3.4444
Baseline MCTD = 3.4000
Your browser does not support the audio element.
In [ ]:
#### data extraction ####
file_count = 100000
count = 0
input_path = "bach-doodle.jsonl-00000-of-00192.gz"
dataset = []
import json
import gzip

with gzip.open(input_path, 'rb') as f_in:
  file_content = f_in.read().decode('utf-8')
  for line in file_content.splitlines():
      count += 1
      record = json.loads(line.strip())
      dataset.append(record)
      if count >= file_count:
          break

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
In [ ]:
# ===== Full Seq2Seq Harmonizer with Chord Output (fixed Attention reuse) =====
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Dense, Bidirectional, Dropout,
    Attention, Concatenate, TimeDistributed, LayerNormalization
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import json
import gzip
import mido
from mido import Message, MidiFile, MidiTrack

FILE_COUNT = 100_000
SEQUENCE_LENGTH = 20
EMBED_DIM = 128
ENC_UNITS = 512
BATCH_SIZE = 64
EPOCHS = 30
VELOCITY = 80
DURATION_TICKS = 480
TEMPO = 500000

input_path = "bach-doodle.jsonl-00000-of-00192.gz"
raw_records = []
count = 0
with gzip.open(input_path, 'rb') as f_in:
    for line in f_in.read().decode('utf-8').splitlines():
        raw_records.append(json.loads(line))
        count += 1
        if count >= FILE_COUNT:
            break

train_records, test_records = train_test_split(raw_records, test_size=0.2, random_state=42)

melody_seqs = []
harmony_seqs = []

for record in train_records:
    inputs = record.get('input_sequence', [])
    outputs = record.get('output_sequence', [])
    for m_seq, h_seq in zip(inputs, outputs):
        mel_p = [n['pitch'] for n in m_seq.get('notes', [])]
        har_p = [n['pitch'] for n in h_seq.get('notes', [])]
        if len(mel_p) >= SEQUENCE_LENGTH and len(har_p) >= SEQUENCE_LENGTH:
            melody_seqs.append(mel_p)
            harmony_seqs.append(har_p)

all_pitches = set(p for seq in (melody_seqs + harmony_seqs) for p in seq)
pitch2idx = {p: i + 1 for i, p in enumerate(sorted(all_pitches))}
idx2pitch = {i: p for p, i in pitch2idx.items()}
vocab_size = len(pitch2idx) + 1

def encode_and_pad(sequences, length):
    enc = [[pitch2idx[p] for p in seq] for seq in sequences]
    return pad_sequences(enc, maxlen=length, padding='pre', truncating='pre')

melody_encoded = encode_and_pad(melody_seqs, SEQUENCE_LENGTH)
harmony_encoded = encode_and_pad(harmony_seqs, SEQUENCE_LENGTH)

decoder_input = harmony_encoded[:, :-1]
decoder_target = harmony_encoded[:, 1:]


Xmel_tr, Xmel_val, Xdec_tr, Xdec_val, Ydec_tr, Ydec_val = train_test_split(
    melody_encoded, decoder_input, decoder_target, test_size=0.1, random_state=42
)


encoder_inputs = Input(shape=(SEQUENCE_LENGTH,), name="encoder_inputs")
enc_emb = Embedding(
    input_dim=vocab_size,
    output_dim=EMBED_DIM,
    mask_zero=False,
    name="encoder_embedding"
)(encoder_inputs)
enc_emb_norm = LayerNormalization(name="encoder_norm")(enc_emb)
encoder_bi = Bidirectional(
    LSTM(ENC_UNITS, return_sequences=True, return_state=True),
    name="encoder_bidirectional"
)
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_bi(enc_emb_norm)
state_h = Concatenate(name="enc_state_h")([forward_h, backward_h])
state_c = Concatenate(name="enc_state_c")([forward_c, backward_c])

decoder_inputs = Input(shape=(SEQUENCE_LENGTH - 1,), name="decoder_inputs")
dec_emb = Embedding(
    input_dim=vocab_size,
    output_dim=EMBED_DIM,
    mask_zero=False,
    name="decoder_embedding"
)(decoder_inputs)
dec_emb_dropout = Dropout(0.3, name="decoder_dropout")(dec_emb)

decoder_lstm_layer = LSTM(
    2 * ENC_UNITS, return_sequences=True, return_state=True, name="decoder_lstm"
)
dec_outputs, dec_h, dec_c = decoder_lstm_layer(dec_emb_dropout, initial_state=[state_h, state_c])

attention_layer = Attention(name="attention_layer")
attn_out = attention_layer([dec_outputs, encoder_outputs])
dec_concat = Concatenate(axis=-1, name="dec_concat")([dec_outputs, attn_out])

dec_dense = TimeDistributed(Dense(vocab_size, activation='softmax'), name="decoder_dense")
dec_pred = dec_dense(dec_concat)

model = Model([encoder_inputs, decoder_inputs], dec_pred)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1
)
early_stop = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
)

model.fit(
    [Xmel_tr, Xdec_tr],
    Ydec_tr,
    validation_data=([Xmel_val, Xdec_val], Ydec_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[reduce_lr, early_stop]
)

encoder_model_inf = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

inf_decoder_inputs = Input(shape=(1,), name="inf_decoder_input")  # one step
inf_state_h = Input(shape=(2 * ENC_UNITS,), name="inf_state_h")
inf_state_c = Input(shape=(2 * ENC_UNITS,), name="inf_state_c")
inf_encoder_outputs = Input(shape=(SEQUENCE_LENGTH, 2 * ENC_UNITS), name="inf_encoder_outputs")

inf_emb = model.get_layer("decoder_embedding")(inf_decoder_inputs)
inf_lstm_out, inf_h, inf_c = decoder_lstm_layer(inf_emb, initial_state=[inf_state_h, inf_state_c])
attn_inf = attention_layer([inf_lstm_out, inf_encoder_outputs])
concat_inf = Concatenate(name="inf_concat", axis=-1)([inf_lstm_out, attn_inf])
inf_pred = model.get_layer("decoder_dense")(concat_inf)

decoder_model_inf = Model(
    [inf_decoder_inputs, inf_state_h, inf_state_c, inf_encoder_outputs],
    [inf_pred, inf_h, inf_c]
)

def generate_harmony(melody_seq, max_len=SEQUENCE_LENGTH):
    encoder_outs, h_enc, c_enc = encoder_model_inf.predict(
        melody_seq.reshape(1, -1), verbose=0
    )
    states = [h_enc, c_enc]
    target_seq = np.array([[0]], dtype='int32')
    generated_indices = []

    for _ in range(max_len):
        pred_probs, h_dec, c_dec = decoder_model_inf.predict(
            [target_seq] + states + [encoder_outs], verbose=0
        )
        sampled_idx = np.argmax(pred_probs[0, -1, :])
        if sampled_idx == 0:
            break
        generated_indices.append(sampled_idx)
        target_seq = np.array([[sampled_idx]], dtype='int32')
        states = [h_dec, c_dec]

    return [idx2pitch[idx] for idx in generated_indices]

sample_mel = Xmel_val[3]
generated_harmony = generate_harmony(sample_mel, max_len=SEQUENCE_LENGTH)

print("Seed melody pitches:", [idx2pitch[i] for i in sample_mel if i != 0])
print("Generated harmony pitches:", generated_harmony)

def save_harmonized_chords_mido(
    melody_pitches, harmony_pitches,
    filename="harmonized_chords.mid",
    velocity=VELOCITY, duration_ticks=DURATION_TICKS, tempo=TEMPO
):
    mid = MidiFile(ticks_per_beat=480)
    track = MidiTrack()
    mid.tracks.append(track)
    track.append(mido.MetaMessage("set_tempo", tempo=tempo))

    length = min(len(melody_pitches), len(harmony_pitches))
    for i in range(length):
        m_pitch = melody_pitches[i]
        h_pitch = harmony_pitches[i]
        notes_to_add = set([m_pitch, h_pitch])


        for pitch in notes_to_add:
            track.append(Message("note_on", note=int(pitch), velocity=velocity, time=0))
        first_off = True
        for pitch in notes_to_add:
            if first_off:
                track.append(Message("note_off", note=int(pitch), velocity=0, time=duration_ticks))
                first_off = False
            else:
                track.append(Message("note_off", note=int(pitch), velocity=0, time=0))

    mid.save(filename)

melody_pitches = [idx2pitch[i] for i in sample_mel if i != 0]
save_harmonized_chords_mido(
    melody_pitches=melody_pitches,
    harmony_pitches=generated_harmony,
    filename="harmonized_chords.mid"
)
model.save("seq2seq_harmonizer_with_attention.keras")
Model: "functional_1"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)        ┃ Output Shape      ┃    Param # ┃ Connected to      ┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ encoder_inputs      │ (None, 20)        │          0 │ -                 │
│ (InputLayer)        │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ encoder_embedding   │ (None, 20, 128)   │      6,016 │ encoder_inputs[0… │
│ (Embedding)         │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ decoder_inputs      │ (None, 19)        │          0 │ -                 │
│ (InputLayer)        │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ encoder_norm        │ (None, 20, 128)   │        256 │ encoder_embeddin… │
│ (LayerNormalizatio… │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ decoder_embedding   │ (None, 19, 128)   │      6,016 │ decoder_inputs[0… │
│ (Embedding)         │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ encoder_bidirectio… │ [(None, 20,       │  2,625,536 │ encoder_norm[0][… │
│ (Bidirectional)     │ 1024), (None,     │            │                   │
│                     │ 512), (None,      │            │                   │
│                     │ 512), (None,      │            │                   │
│                     │ 512), (None,      │            │                   │
│                     │ 512)]             │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ decoder_dropout     │ (None, 19, 128)   │          0 │ decoder_embeddin… │
│ (Dropout)           │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ enc_state_h         │ (None, 1024)      │          0 │ encoder_bidirect… │
│ (Concatenate)       │                   │            │ encoder_bidirect… │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ enc_state_c         │ (None, 1024)      │          0 │ encoder_bidirect… │
│ (Concatenate)       │                   │            │ encoder_bidirect… │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ decoder_lstm (LSTM) │ [(None, 19,       │  4,722,688 │ decoder_dropout[… │
│                     │ 1024), (None,     │            │ enc_state_h[0][0… │
│                     │ 1024), (None,     │            │ enc_state_c[0][0] │
│                     │ 1024)]            │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ attention_layer     │ (None, 19, 1024)  │          0 │ decoder_lstm[0][… │
│ (Attention)         │                   │            │ encoder_bidirect… │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dec_concat          │ (None, 19, 2048)  │          0 │ decoder_lstm[0][… │
│ (Concatenate)       │                   │            │ attention_layer[… │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ decoder_dense       │ (None, 19, 47)    │     96,303 │ dec_concat[0][0]  │
│ (TimeDistributed)   │                   │            │                   │
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
 Total params: 7,456,815 (28.45 MB)
 Trainable params: 7,456,815 (28.45 MB)
 Non-trainable params: 0 (0.00 B)
Epoch 1/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 12s 56ms/step - accuracy: 0.1034 - loss: 3.3125 - val_accuracy: 0.1438 - val_loss: 2.7514 - learning_rate: 1.0000e-04
Epoch 2/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 39ms/step - accuracy: 0.1605 - loss: 2.6806 - val_accuracy: 0.1776 - val_loss: 2.6196 - learning_rate: 1.0000e-04
Epoch 3/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 37ms/step - accuracy: 0.1952 - loss: 2.5278 - val_accuracy: 0.2070 - val_loss: 2.4812 - learning_rate: 1.0000e-04
Epoch 4/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 2s 35ms/step - accuracy: 0.2323 - loss: 2.4002 - val_accuracy: 0.2464 - val_loss: 2.3687 - learning_rate: 1.0000e-04
Epoch 5/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 35ms/step - accuracy: 0.2783 - loss: 2.2889 - val_accuracy: 0.2798 - val_loss: 2.2875 - learning_rate: 1.0000e-04
Epoch 6/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 2s 34ms/step - accuracy: 0.3064 - loss: 2.2035 - val_accuracy: 0.2980 - val_loss: 2.2555 - learning_rate: 1.0000e-04
Epoch 7/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 36ms/step - accuracy: 0.3233 - loss: 2.1646 - val_accuracy: 0.3077 - val_loss: 2.2310 - learning_rate: 1.0000e-04
Epoch 8/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 36ms/step - accuracy: 0.3322 - loss: 2.1423 - val_accuracy: 0.3142 - val_loss: 2.2154 - learning_rate: 1.0000e-04
Epoch 9/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 5s 35ms/step - accuracy: 0.3389 - loss: 2.1170 - val_accuracy: 0.3254 - val_loss: 2.1949 - learning_rate: 1.0000e-04
Epoch 10/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 35ms/step - accuracy: 0.3511 - loss: 2.0817 - val_accuracy: 0.3251 - val_loss: 2.1866 - learning_rate: 1.0000e-04
Epoch 11/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 40ms/step - accuracy: 0.3568 - loss: 2.0651 - val_accuracy: 0.3321 - val_loss: 2.1745 - learning_rate: 1.0000e-04
Epoch 12/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 5s 37ms/step - accuracy: 0.3695 - loss: 2.0370 - val_accuracy: 0.3347 - val_loss: 2.1656 - learning_rate: 1.0000e-04
Epoch 13/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 5s 36ms/step - accuracy: 0.3733 - loss: 2.0185 - val_accuracy: 0.3391 - val_loss: 2.1619 - learning_rate: 1.0000e-04
Epoch 14/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 38ms/step - accuracy: 0.3792 - loss: 2.0060 - val_accuracy: 0.3359 - val_loss: 2.1597 - learning_rate: 1.0000e-04
Epoch 15/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 36ms/step - accuracy: 0.3858 - loss: 1.9869 - val_accuracy: 0.3376 - val_loss: 2.1612 - learning_rate: 1.0000e-04
Epoch 16/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 36ms/step - accuracy: 0.3945 - loss: 1.9588 - val_accuracy: 0.3430 - val_loss: 2.1519 - learning_rate: 1.0000e-04
Epoch 17/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 36ms/step - accuracy: 0.4003 - loss: 1.9424 - val_accuracy: 0.3520 - val_loss: 2.1466 - learning_rate: 1.0000e-04
Epoch 18/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 5s 37ms/step - accuracy: 0.4050 - loss: 1.9231 - val_accuracy: 0.3440 - val_loss: 2.1580 - learning_rate: 1.0000e-04
Epoch 19/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step - accuracy: 0.4075 - loss: 1.8986
Epoch 19: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 36ms/step - accuracy: 0.4075 - loss: 1.8987 - val_accuracy: 0.3498 - val_loss: 2.1501 - learning_rate: 1.0000e-04
Epoch 20/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 5s 37ms/step - accuracy: 0.4195 - loss: 1.8685 - val_accuracy: 0.3473 - val_loss: 2.1513 - learning_rate: 5.0000e-05
Epoch 21/30
70/71 ━━━━━━━━━━━━━━━━━━━━ 0s 34ms/step - accuracy: 0.4310 - loss: 1.8369
Epoch 21: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
71/71 ━━━━━━━━━━━━━━━━━━━━ 5s 37ms/step - accuracy: 0.4309 - loss: 1.8371 - val_accuracy: 0.3498 - val_loss: 2.1497 - learning_rate: 5.0000e-05
Epoch 22/30
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 35ms/step - accuracy: 0.4387 - loss: 1.8123 - val_accuracy: 0.3451 - val_loss: 2.1566 - learning_rate: 2.5000e-05
Epoch 22: early stopping
Restoring model weights from the end of the best epoch: 17.
Seed melody pitches: [69, 64, 69, 66, 69, 62, 67, 66, 69, 71, 74, 78, 76, 76, 69, 69, 69, 67, 74, 67]
Generated harmony pitches: [60, 59, 60, 62, 64, 62, 64, 62, 50, 49, 50, 48, 50, 52, 50, 48, 47, 45, 47, 45]
In [ ]:
#output
from IPython.display import Audio

!apt-get install -y fluidsynth
!pip install pyfluidsynth

import subprocess
def midi_to_wav(midi_path, wav_path="output.wav", soundfont="example.sf2"):
    subprocess.run(["fluidsynth", "-ni", soundfont, midi_path, "-F", wav_path, "-r", "44100"])
    return wav_path
sf2_path = "example.sf2"
midi_path = "harmonized_chords.mid"
wav_path = midi_to_wav(midi_path, soundfont=sf2_path)

Audio(wav_path)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fluidsynth is already the newest version (2.2.5-1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
Collecting pyfluidsynth
  Downloading pyfluidsynth-1.3.4-py3-none-any.whl.metadata (7.5 kB)
Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from pyfluidsynth) (2.0.2)
Downloading pyfluidsynth-1.3.4-py3-none-any.whl (22 kB)
Installing collected packages: pyfluidsynth
Successfully installed pyfluidsynth-1.3.4
Out[ ]:
Your browser does not support the audio element.
In [ ]:
#input
import mido
from mido import Message, MidiFile, MidiTrack
import subprocess
from IPython.display import Audio
melody_pitches = [idx2pitch[i] for i in sample_mel if i != 0]

def save_melody_midi(pitches, filename="input_melody.mid", velocity=VELOCITY, duration_ticks=DURATION_TICKS, tempo=TEMPO):
    mid = MidiFile(ticks_per_beat=480)
    track = MidiTrack()
    mid.tracks.append(track)
    track.append(mido.MetaMessage("set_tempo", tempo=tempo))
    for pitch in pitches:
        track.append(Message("note_on",  note=int(pitch), velocity=velocity, time=0))
        track.append(Message("note_off", note=int(pitch), velocity=0,   time=duration_ticks))
    mid.save(filename)

save_melody_midi(melody_pitches, filename="input_melody.mid")

def midi_to_wav(midi_path, wav_path="input.wav", soundfont="example.sf2"):
    subprocess.run([
        "fluidsynth", "-ni", soundfont,
        midi_path, "-F", wav_path, "-r", "44100"
    ], check=True)
    return wav_path
wav_file = midi_to_wav("input_melody.mid", wav_path="input.wav", soundfont="example.sf2")

Audio(wav_file)
Out[ ]:
Your browser does not support the audio element.
In [ ]:
!pip install tensorflow
In [ ]:
import tensorflow as tf
import numpy as np
import pandas as pd
import json, gzip
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

FILE_COUNT = 100_000
SEQUENCE_LENGTH = 20

raw_records = []
with gzip.open("bach-doodle.jsonl-00000-of-00192.gz", "rb") as f:
    for i, line in enumerate(f.read().decode("utf-8").splitlines()):
        raw_records.append(json.loads(line))
        if i + 1 >= FILE_COUNT:
            break

train_records, _ = train_test_split(raw_records, test_size=0.2, random_state=42)

melody_seqs, harmony_seqs = [], []
for rec in train_records:
    for m_seq, h_seq in zip(rec.get("input_sequence", []), rec.get("output_sequence", [])):
        mel_p = [n["pitch"] for n in m_seq.get("notes", [])]
        har_p = [n["pitch"] for n in h_seq.get("notes", [])]
        if len(mel_p) >= SEQUENCE_LENGTH and len(har_p) >= SEQUENCE_LENGTH:
            melody_seqs.append(mel_p)
            harmony_seqs.append(har_p)

all_pitches = set(p for seq in melody_seqs + harmony_seqs for p in seq)
pitch2idx = {p: i + 1 for i, p in enumerate(sorted(all_pitches))}
idx2pitch = {i: p for p, i in pitch2idx.items()}

def encode_and_pad(seqs, length):
    return pad_sequences([[pitch2idx[p] for p in seq] for seq in seqs],
                         maxlen=length, padding="pre", truncating="pre")

melody_encoded  = encode_and_pad(melody_seqs, SEQUENCE_LENGTH)
harmony_encoded = encode_and_pad(harmony_seqs, SEQUENCE_LENGTH)
decoder_input   = harmony_encoded[:, :-1]
decoder_target  = harmony_encoded[:,  1:]

Xmel_tr, Xmel_val, Xdec_tr, Xdec_val, Ydec_tr, Ydec_val = train_test_split(
    melody_encoded, decoder_input, decoder_target, test_size=0.1, random_state=42
)

model = load_model("seq2seq_harmonizer_with_attention.keras", compile=False)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss="sparse_categorical_crossentropy", metrics=["accuracy"])

y_pred_probs = model.predict([Xmel_val, Xdec_val], verbose=0)
y_pred_indices = np.argmax(y_pred_probs, axis=-1)

def pitch_class_dist(p, q):
    return min(abs((p % 12) - (q % 12)), 12 - abs((p % 12) - (q % 12)))
sequence_data = []
for i in range(len(Xmel_val)):
    preds = y_pred_indices[i]
    target = Ydec_val[i]
    mask = target != 0
    acc = np.sum(preds[mask] == target[mask]) / np.sum(mask) if np.sum(mask) > 0 else 0

    pred_pitches = [idx2pitch[idx] for idx in preds if idx != 0]
    ctd = np.mean([pitch_class_dist(p1, p2)
                   for p1, p2 in zip(pred_pitches[:-1], pred_pitches[1:])]) if len(pred_pitches) > 1 else 0

    mel_seq = Xmel_val[i]
    mctd_vals = []
    for j, idx in enumerate(preds):
        if idx == 0 or j+1 >= len(mel_seq) or mel_seq[j+1] == 0:
            continue
        mctd_vals.append(pitch_class_dist(idx2pitch[idx], idx2pitch[mel_seq[j+1]]))
    mctd = np.mean(mctd_vals) if mctd_vals else 0

    sequence_data.append({"Seq #": i, "Accuracy": acc, "CTD": ctd, "MCTD": mctd})

df = pd.DataFrame(sequence_data)
print(df)

print("\n=== Averages ===")
print(f"Mean Accuracy = {df['Accuracy'].mean():.4f}")
print(f"Mean CTD      = {df['CTD'].mean():.4f}")
print(f"Mean MCTD     = {df['MCTD'].mean():.4f}")
     Seq #  Accuracy       CTD      MCTD
0        0  0.526316  2.166667  3.578947
1        1  0.368421  2.500000  2.736842
2        2  0.473684  0.444444  1.052632
3        3  0.315789  2.611111  3.789474
4        4  0.473684  1.500000  2.736842
..     ...       ...       ...       ...
498    498  0.210526  2.222222  3.526316
499    499  0.526316  2.666667  3.105263
500    500  0.368421  1.666667  2.789474
501    501  0.368421  2.500000  2.684211
502    502  0.526316  2.555556  3.263158

[503 rows x 4 columns]

=== Averages ===
Mean Accuracy = 0.3520
Mean CTD      = 2.2772
Mean MCTD     = 3.0331
In [ ]:
#baseline
import numpy as np
import random
from midiutil import MIDIFile
from IPython.display import Audio, display
from midi2audio import FluidSynth

# === Assumes these are already defined in your environment ===
# - Xmel_val, Ydec_val
# - idx2pitch
# - transitions: dict from chord → list of next chord candidates
# - transitionProbabilities: dict from chord → probability list
# - chord_to_notes: function that maps chord name → list of MIDI note integers

def pitch_class_dist(p, q):
    return min(abs((p % 12) - (q % 12)), 12 - abs((p % 12) - (q % 12)))

def sample_conditioned(length, required_notes=None):
    seq = [random.choice(list(transitionProbabilities.keys()))]
    for i in range(1, length):
        prev_chord = seq[-1]
        candidates = transitions.get(prev_chord, [])
        probs = np.array(transitionProbabilities.get(prev_chord, []), dtype=np.float64)

        if not candidates or not probs.any():
            seq.append(random.choice(list(transitionProbabilities.keys())))
            continue

        if not np.isclose(probs.sum(), 1.0):
            probs /= probs.sum()

        required_note = required_notes[i] if required_notes and i < len(required_notes) else None

        if required_note:
            filtered = [
                (ch, p) for ch, p in zip(candidates, probs)
                if required_note in chord_to_notes(ch)
            ]
            if filtered:
                new_candidates, new_probs = zip(*filtered)
                new_probs = np.array(new_probs)
                new_probs /= new_probs.sum()
                nextchord = np.random.choice(new_candidates, p=new_probs)
            else:
                nextchord = np.random.choice(candidates, p=probs)
        else:
            nextchord = np.random.choice(candidates, p=probs)

        seq.append(nextchord)
    return seq

# === Extract one real sample from validation set ===
val_index = 0  # You can loop over more samples later
melody_input = Xmel_val[val_index]
ground_truth_target = Ydec_val[val_index]

melody_notes = [idx2pitch[i] for i in melody_input if i != 0]
harmony_notes = [idx2pitch[i] for i in ground_truth_target if i != 0]

# === Generate predicted chords using the baseline model ===
chords = sample_conditioned(len(melody_notes), required_notes=melody_notes)
predicted_harmony = [chord_to_notes(ch)[0] for ch in chords]

# === Save as MIDI ===
midi = MIDIFile(1)
track = 0
midi.addTempo(track, 0, 120)

current_time = 0
default_duration = 4
for chord in chords:
    notes = chord_to_notes(chord)
    for pitch in notes:
        midi.addNote(track, 0, pitch, current_time, default_duration, 100)
    current_time += default_duration

with open("conditioned_chord_sample.mid", "wb") as f:
    midi.writeFile(f)

# === Evaluate ===
ground_truth = harmony_notes

# Accuracy

min_len = min(len(ground_truth), len(chords))
ground_truth = ground_truth[:min_len]
chords = chords[:min_len]

ground_truth = [int(t) for t in ground_truth]
all_pitches = []
for ch in chords:
    all_pitches.extend(chord_to_notes(ch))
correct = sum(t == ch for ch, t in zip(all_pitches, ground_truth))
accuracy = correct / len(ground_truth)


# CTD
ctd_vals = [
    pitch_class_dist(predicted_harmony[i], predicted_harmony[i + 1])
    for i in range(len(predicted_harmony) - 1)
]
mean_ctd = np.mean(ctd_vals)

# MCTD
mctd_vals = [
    pitch_class_dist(p, t)
    for p, t in zip(predicted_harmony, ground_truth)
]
mean_mctd = np.mean(mctd_vals)

# Print results
print(f"Baseline Accuracy = {accuracy:.4f}")
print(f"Baseline CTD = {mean_ctd:.4f}")
print(f"Baseline MCTD = {mean_mctd:.4f}")

# === Optional: Convert to audio and play ===
fs = FluidSynth("FluidR3Mono_GM.sf3")  # path to your .sf3 soundfont
fs.midi_to_audio("conditioned_chord_sample.mid", "conditioned_chord_sample.wav")
display(Audio("conditioned_chord_sample.wav"))
Baseline Accuracy = 0.0526
Baseline CTD = 2.1579
Baseline MCTD = 2.0000
Your browser does not support the audio element.
In [ ]:
print(ground_truth[:10])
print(all_pitches[:10])
[61, 63, 65, 63, 61, 60, 58, 56, 54, 55]
[39, np.int64(63), np.int64(67), np.int64(70), np.int64(73), 44, np.int64(68), np.int64(72), np.int64(75), np.int64(78)]
In [ ]: